Setup

Load libraries

library(ggplot2)
library(tidyr)
library(dplyr)
library(Matrix)
library(Seurat)
library(cowplot)
library(patchwork)

# parallelization
library(future)
options(future.globals.maxSize= +Inf)
plan()
sequential:
- args: function (expr, envir = parent.frame(), substitute = TRUE, lazy = FALSE, seed = NULL, globals = TRUE, local = TRUE, earlySignal = FALSE, label = NULL, ...)
- tweaked: FALSE
- call: NULL

Process Human Data

import_remote_data <- function(file_url, type = "table", header = FALSE) {
  con <- gzcon(url(file_url))
  txt <- readLines(con)
  if (type == "MM") { return (readMM(textConnection(txt))) }
  if (type == "table") { return (read.table(textConnection(txt), header = header)) }
}
count_matrix_URL <- "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE137nnn/GSE137537/suppl/GSE137537_counts.mtx.gz"
gene_names_URL <- "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE137nnn/GSE137537/suppl/GSE137537_gene_names.txt.gz"
sample_annotations_URL <- "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE137nnn/GSE137537/suppl/GSE137537_sample_annotations.tsv.gz"

human.count_matrix <- as.matrix(import_remote_data(count_matrix_URL, type = "MM"))
human.gene_names <- import_remote_data(gene_names_URL, type = "table")
human.sample_annotations <- import_remote_data(sample_annotations_URL, type = "table", header = TRUE)
human_ret_seurat
An object of class Seurat 
19712 features across 20091 samples within 1 assay 
Active assay: RNA (19712 features, 0 variable features)

Process Mouse Data

mouse.data <- Read10X(data.dir = "filtered_feature_bc_matrix")
dimnames(mouse.data)[[1]] <- tolower(dimnames(mouse.data)[[1]])
dimnames(mouse.data)[[2]] <- tolower(dimnames(mouse.data)[[2]])
mouse_ret_seurat <- CreateSeuratObject(counts = mouse.data, 
                                       project = "mouse_ret", 
                                       min.cells = 3, 
                                       min.features = 200)
mouse_ret_seurat
An object of class Seurat 
16424 features across 4510 samples within 1 assay 
Active assay: RNA (16424 features, 0 variable features)

Process Primate Data

url=https://ftp.ncbi.nlm.nih.gov/geo/series/GSE118nnn/GSE118546/suppl/GSE118546_macaque_fovea_all_10X_Jan2018.Rdata.gz
wget $url -O primate_data/GSE118546_macaque_fovea_all_10X_Jan2018.Rdata.gz
gunzip primate_data/*
install.packages( c('devtools', 'roxygen2') )
library(devtools)
library(roxygen2)
install_github( 'hb-gitified/cellrangerRkit',
                auth_token = 'your_token' )
macaque_fovea_seurat
An object of class Seurat 
30039 features across 111993 samples within 1 assay 
Active assay: RNA (30039 features, 0 variable features)

Cleanup

rm(human.count_matrix, human.gene_names, human.sample_annotations)
rm(count_matrix_URL, gene_names_URL, sample_annotations_URL, import_remote_data)
rm(mouse.data)
rm(Count.mat_fovea, macaque_fovea)

Combine

# combine
ret.list <- list(human = human_ret_seurat, mouse = mouse_ret_seurat, macaque = macaque_fovea_seurat)

# preprocess
ret.list <- lapply(X = ret.list, FUN = function(x) {
    x <- NormalizeData(x, verbose = FALSE)
    x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000, verbose = FALSE)
})

# cleanup
rm(human_ret_seurat, mouse_ret_seurat, macaque_fovea_seurat)

Integration

plan("multiprocess", workers = 4)
ret.anchors <- FindIntegrationAnchors(object.list = ret.list, dims = 1:50,  anchor.features = 1000)
plan("multiprocess", workers = 1)
ret.combined <- IntegrateData(anchorset = ret.anchors, dims = 1:50)

Integrated Analysis

plan("multiprocess", workers = 4)

DefaultAssay(ret.combined) <- "integrated"

# Run the standard workflow for visualization and clustering
ret.combined <- ScaleData(ret.combined, verbose = FALSE)
ret.combined <- RunPCA(ret.combined, npcs = 50, verbose = FALSE)
# t-SNE and Clustering
ret.combined <- RunUMAP(ret.combined, reduction = "pca", dims = 1:35)
ret.combined <- FindNeighbors(ret.combined, reduction = "pca", dims = 1:35)
ret.combined <- FindClusters(ret.combined, resolution = 0.075)

UMAP Visualization

DimPlot(ret.combined, reduction = "umap", group.by = "orig.ident")

DimPlot(ret.combined, reduction = "umap", label = TRUE)

DimPlot(ret.combined, reduction = "umap", split.by = "orig.ident", ncol = 1)

Identify Clusters with Canonical Markers

DefaultAssay(ret.combined) <- "RNA"

features <- tolower(c("Pde6a","Gnat2","Nefl","Camk2b","Thy1","Gad1","Slc6a9",
                      "Pcsk6","Trpm1","Sept4","Glul","Arr3","C1qa","Tm4sf1", "Mgp"))

FeaturePlot(object = ret.combined, 
            features = features, 
            pt.size = 0.1,
            cols = c("lightgrey", "#F26969"),
            min.cutoff = "q9",
            combine = TRUE) & NoLegend() & NoAxes()


# Cowplot method: make sure to change to "combine = FALSE" and remove "& NoLegend() & NoAxes"

# for(i in 1:length(p)) {
#   p[[i]] <- p[[i]] + NoLegend() + NoAxes()
# }
# 
# cowplot::plot_grid(plotlist = p, ncol=3)

Markers were determined from this paper and other sources.

Find Differentially Expressed Genes

cells.types <- c("Rod", "BC", "MG", "RGC", "CC", "AC", "VC", "HC", "M")
theme_set(theme_cowplot())

cell_type_avg <- function(seurat.combined, ident) {
  cells.x <- subset(seurat.combined, idents = ident)
  Idents(cells.x) <- "orig.ident"
  cells.x.avg <- log1p(AverageExpression(cells.x, verbose = FALSE)$RNA)
  cells.x.avg$gene <- rownames(cells.x.avg)
  return(cells.x.avg)
}

cells.plot <- as.list(cells.types)
cells.plot <- lapply(cells.plot, FUN = function(x) {
  cells.x.avg <- cell_type_avg(ret.combined, ident = x)
  x <- ggplot(cells.x.avg, aes(human_ret, mouse_ret)) + geom_point(size = 0.1) + ggtitle(x)
  return(x)
})

# For individual plots
# for (p in cells.plot) {
#   print(p)
# }

# For grid plot
cowplot::plot_grid(plotlist = cells.plot, ncol = 3)

ret.combined$celltype.organism <- paste(Idents(ret.combined), ret.combined$orig.ident, sep = "_")
ret.combined$celltype <- Idents(ret.combined)
Idents(ret.combined) <- "celltype.organism"

Tables with the most differentially expressed genes in each cell subtype:

for(i in seq_along(cells.diffgenes)) {
  print(knitr::kable(head(cells.diffgenes[[i]]),caption=cells.types[[i]]))
}

Rod
p_val avg_logFC pct.1 pct.2 p_val_adj logp types genes
ckb 0 1.4493770 0.918 0.724 0 Inf Rod ckb
hsp90aa1 0 1.3457646 0.854 0.627 0 Inf Rod hsp90aa1
nrl 0 1.3140138 0.874 0.635 0 Inf Rod nrl
0610009b22rik 0 -0.6622860 0.000 0.130 0 Inf Rod 0610009b22rik
gm17018 0 -0.6831275 0.000 0.130 0 Inf Rod gm17018
spata1 0 -0.6929677 0.000 0.132 0 Inf Rod spata1
BC
p_val avg_logFC pct.1 pct.2 p_val_adj logp types genes
neat1 0 3.086391 0.793 0.064 0 Inf BC neat1
mtch1 0 -1.305054 0.000 0.459 0 Inf BC mtch1
selenom 0 -1.338108 0.000 0.480 0 Inf BC selenom
araf 0 -1.342891 0.013 0.494 0 Inf BC araf
klc3 0 -1.424615 0.002 0.500 0 Inf BC klc3
pea15a 0 -1.427543 0.000 0.500 0 Inf BC pea15a
MG
p_val avg_logFC pct.1 pct.2 p_val_adj logp types genes
tf 0 5.089073 0.962 0.000 0 Inf MG tf
spp1 0 3.879036 0.847 0.003 0 Inf MG spp1
crabp1 0 3.865908 0.876 0.028 0 Inf MG crabp1
gpx3 0 3.736219 0.869 0.052 0 Inf MG gpx3
ftl 0 3.672007 0.877 0.000 0 Inf MG ftl
actg1 0 3.639157 0.905 0.026 0 Inf MG actg1
RGC
p_val avg_logFC pct.1 pct.2 p_val_adj logp types genes
malat1 0e+00 -5.358199 0.000 1.000 0.0000020 23.73532 RGC malat1
gm42418 0e+00 -5.948091 0.000 0.957 0.0000083 22.29749 RGC gm42418
ay036118 0e+00 -2.770196 0.000 0.826 0.0004565 18.29374 RGC ay036118
neat1 0e+00 5.237230 0.889 0.087 0.0007506 17.79653 RGC neat1
linc00599 1e-07 2.669694 0.815 0.000 0.0024960 16.59495 RGC linc00599
kcnq1ot1 1e-07 2.290579 0.926 0.261 0.0025965 16.55548 RGC kcnq1ot1
CC
p_val avg_logFC pct.1 pct.2 p_val_adj logp types genes
gm42418 0 -5.444663 0.000 1.000 0 185.7925 CC gm42418
malat1 0 -5.893437 0.000 1.000 0 185.7925 CC malat1
ay036118 0 -2.515711 0.000 0.943 0 170.6009 CC ay036118
gnat2 0 -2.832927 0.117 0.943 0 153.8304 CC gnat2
mir124a-1hg 0 -2.420164 0.000 0.869 0 151.9125 CC mir124a-1hg
gngt2 0 -3.274161 0.143 0.937 0 149.3963 CC gngt2
AC
p_val avg_logFC pct.1 pct.2 p_val_adj logp types genes
gm42418 0 -5.868449 0 1.000 0 258.1168 AC gm42418
malat1 0 -6.139436 0 0.994 0 255.9481 AC malat1
ay036118 0 -2.993976 0 0.935 0 236.7899 AC ay036118
snhg11 0 -3.730037 0 0.916 0 230.5453 AC snhg11
ac149090.1 0 -2.375670 0 0.729 0 173.6489 AC ac149090.1
c230004f18rik 0 -2.693746 0 0.729 0 173.6488 AC c230004f18rik
VC
p_val avg_logFC pct.1 pct.2 p_val_adj logp types genes
hla-b 0 3.429125 0.884 0.00 0 112.97262 VC hla-b
rps3a 0 2.938618 0.826 0.00 0 104.06885 VC rps3a
hla-e 0 3.220179 0.826 0.00 0 104.06878 VC hla-e
hla-a 0 3.030967 0.812 0.00 0 101.88369 VC hla-a
hla-c 0 2.913989 0.797 0.00 0 99.71476 VC hla-c
a2m 0 3.322554 0.797 0.01 0 96.28914 VC a2m
HC
p_val avg_logFC pct.1 pct.2 p_val_adj logp types genes
gm42418 0 -6.046691 0.000 1.000 0 108.45517 HC gm42418
malat1 0 -5.904074 0.000 0.942 0 100.35094 HC malat1
neat1 0 4.028649 0.967 0.043 0 70.98094 HC neat1
ay036118 0 -2.922930 0.000 0.710 0 70.65422 HC ay036118
pvalb 0 3.437120 0.902 0.000 0 63.70192 HC pvalb
gpi1 0 -2.385417 0.000 0.580 0 55.70038 HC gpi1
M
p_val avg_logFC pct.1 pct.2 p_val_adj logp types genes
ftl 0 4.612295 0.98 0 0 80.75030 M ftl
hla-dra 0 4.664191 0.94 0 0 76.57279 M hla-dra
hla-a 0 2.899218 0.94 0 0 76.57279 M hla-a
hla-drb1 0 4.096260 0.92 0 0 74.52028 M hla-drb1
rps3a 0 3.397725 0.92 0 0 74.52028 M rps3a
hla-b 0 3.163581 0.92 0 0 74.52028 M hla-b

Save as csv files

for(i in seq_along(cells.diffgenes)) {
  write.csv(cells.diffgenes[[i]], sprintf("results/%d_%s.csv", i, cells.types[[i]]))
}
genes_to_plot <- 3
for (i in seq_along(cells.types)) {
  print(FeaturePlot(object = ret.combined, 
              features = rownames(cells.diffgenes[[i]])[1:genes_to_plot], 
              split.by = "orig.ident", 
              max.cutoff = 3, 
              cols = c("grey", "red"),
              pt.size = 0.07,
              combine = TRUE,
              label.size = 0.5
              ) + plot_annotation(title = cells.types[[i]]) & NoLegend() & NoAxes()
        )
}

Check cell proportion for each species:

knitr::kable(prop.table(x = table(Idents(ret.combined), ret.combined@meta.data$orig.ident), margin = 2))
human_ret macaque_fovea mouse_ret
0 0.2627047 0.1535810 0.2875831
1 0.5498980 0.0758172 0.3164080
2 0.0001493 0.1855205 0.0002217
3 0.0009955 0.1458216 0.0035477
4 0.0531581 0.0808176 0.0838137
5 0.0114977 0.0783888 0.0388027
6 0.0406650 0.0552267 0.0569845
7 0.0187148 0.0577625 0.0343681
8 0.0484794 0.0443242 0.0917960
9 0.0001493 0.0342075 0.0000000
10 0.0000498 0.0232247 0.0004435
11 0.0076154 0.0156081 0.0152993
12 0.0000000 0.0117775 0.0000000
13 0.0034344 0.0089827 0.0436807
14 0.0000000 0.0109203 0.0000000
15 0.0000000 0.0101435 0.0008869
16 0.0024887 0.0039645 0.0261641
17 0.0000000 0.0039110 0.0000000

Gene Enrichment Analysis

plot_enrichment <- function(type = "Rod", other = "") {
    file_path <- sprintf("enrich_data/%s_%s.txt", type, other)
    x <- read.table(file_path, header=T, sep="\t", skip = 11)
    colnames(x) <- gsub("upload_1..fold.Enrichment.", "Fold_Enrichment", colnames(x))
    colnames(x) <- gsub("upload_1..FDR.", "FDR", colnames(x))
    colnames(x) <- gsub("GO.biological.process.complete", "GO", colnames(x))
    colnames(x) <- gsub("Homo.sapiens...REFLIST..20851.", "Count", colnames(x))
    x$GO<- factor(x$GO, levels = x$GO[order(x$Fold_Enrichment, decreasing =F)])
    x <- x[order(x$FDR),]
    x <- x[1:10,]
    g<- ggplot(data=x, aes(x=Fold_Enrichment, y=GO, colour=FDR)) + 
        geom_point(aes(size=Count)) + 
        theme_bw() +
        theme(panel.background = element_rect(fill = NA) , 
              axis.ticks.x=element_blank(), 
              axis.text.y = element_text(size = 12) , 
              panel.grid.major = element_blank(), 
              panel.grid.minor = element_blank()) + 
        labs(x = " ", y =" ") +
        scale_colour_gradient(low = "red", high = "blue") +
        ggtitle(type)
    return(g)
}

for (type in cells.types) {
    print(plot_enrichment(type = type, other= "top200"))
    rm(type)
}

---
title: "Integrating Primate Data into Analysis"
output: 
  html_notebook:
    toc: true
---
# Setup
Load libraries
```{r message=FALSE, warning=FALSE}
library(ggplot2)
library(tidyr)
library(dplyr)
library(Matrix)
library(Seurat)
library(cowplot)
library(patchwork)

# parallelization
library(future)
options(future.globals.maxSize= +Inf)
plan()
```
Process Human Data
```{r}
import_remote_data <- function(file_url, type = "table", header = FALSE) {
  con <- gzcon(url(file_url))
  txt <- readLines(con)
  if (type == "MM") { return (readMM(textConnection(txt))) }
  if (type == "table") { return (read.table(textConnection(txt), header = header)) }
}
count_matrix_URL <- "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE137nnn/GSE137537/suppl/GSE137537_counts.mtx.gz"
gene_names_URL <- "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE137nnn/GSE137537/suppl/GSE137537_gene_names.txt.gz"
sample_annotations_URL <- "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE137nnn/GSE137537/suppl/GSE137537_sample_annotations.tsv.gz"

human.count_matrix <- as.matrix(import_remote_data(count_matrix_URL, type = "MM"))
human.gene_names <- import_remote_data(gene_names_URL, type = "table")
human.sample_annotations <- import_remote_data(sample_annotations_URL, type = "table", header = TRUE)
```
```{r}
rownames(human.count_matrix) <- tolower(human.gene_names[,1])
colnames(human.count_matrix) <- tolower(human.sample_annotations[,1])

human_ret_seurat <- CreateSeuratObject(counts = human.count_matrix, 
                                       meta.data = human.sample_annotations, 
                                       project = "human_ret", 
                                       min.cells = 3, 
                                       min.features = 200)
human_ret_seurat
```

Process Mouse Data
```{r}
mouse.data <- Read10X(data.dir = "filtered_feature_bc_matrix")
dimnames(mouse.data)[[1]] <- tolower(dimnames(mouse.data)[[1]])
dimnames(mouse.data)[[2]] <- tolower(dimnames(mouse.data)[[2]])
mouse_ret_seurat <- CreateSeuratObject(counts = mouse.data, 
                                       project = "mouse_ret", 
                                       min.cells = 3, 
                                       min.features = 200)
mouse_ret_seurat
```

Process Primate Data
```{bash}
url=https://ftp.ncbi.nlm.nih.gov/geo/series/GSE118nnn/GSE118546/suppl/GSE118546_macaque_fovea_all_10X_Jan2018.Rdata.gz
wget $url -O primate_data/GSE118546_macaque_fovea_all_10X_Jan2018.Rdata.gz
gunzip primate_data/*
```
```{r}
install.packages( c('devtools', 'roxygen2') )
library(devtools)
library(roxygen2)
install_github( 'hb-gitified/cellrangerRkit',
                auth_token = 'your_token' )
```
```{r}
load("primate_data/GSE118546_macaque_fovea_all_10X_Jan2018.Rdata")

dimnames(Count.mat_fovea)[[1]] <- tolower(dimnames(Count.mat_fovea)[[1]])
macaque_fovea_seurat <- CreateSeuratObject(Count.mat_fovea,
                                           project = "macaque_fovea", 
                                           min.cells = 3, 
                                           min.features = 200)

# give macaque dta uniform name in "orig.ident" metadata column
AddMetaData(macaque_fovea_seurat, 
            metadata = macaque_fovea_seurat[["orig.ident"]], 
            col.name = "orig.sample.name")
macaque_fovea_seurat[["orig.ident"]] <- "macaque_fovea"

macaque_fovea_seurat
```
Cleanup
```{r}
rm(human.count_matrix, human.gene_names, human.sample_annotations)
rm(count_matrix_URL, gene_names_URL, sample_annotations_URL, import_remote_data)
rm(mouse.data)
rm(Count.mat_fovea, macaque_fovea)
```


Combine
```{r}
# combine
ret.list <- list(human = human_ret_seurat, mouse = mouse_ret_seurat, macaque = macaque_fovea_seurat)

# preprocess
ret.list <- lapply(X = ret.list, FUN = function(x) {
    x <- NormalizeData(x, verbose = FALSE)
    x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000, verbose = FALSE)
})

# cleanup
rm(human_ret_seurat, mouse_ret_seurat, macaque_fovea_seurat)
```

# Integration
```{r}
plan("multiprocess", workers = 4)
ret.anchors <- FindIntegrationAnchors(object.list = ret.list, dims = 1:50,  anchor.features = 1000)
plan("multiprocess", workers = 1)
ret.combined <- IntegrateData(anchorset = ret.anchors, dims = 1:50)
```

# Integrated Analysis
```{r}
plan("multiprocess", workers = 4)

DefaultAssay(ret.combined) <- "integrated"

# Run the standard workflow for visualization and clustering
ret.combined <- ScaleData(ret.combined, verbose = FALSE)
ret.combined <- RunPCA(ret.combined, npcs = 50, verbose = FALSE)
# t-SNE and Clustering
ret.combined <- RunUMAP(ret.combined, reduction = "pca", dims = 1:35)
ret.combined <- FindNeighbors(ret.combined, reduction = "pca", dims = 1:35)
ret.combined <- FindClusters(ret.combined, resolution = 0.075)
```
# UMAP Visualization
```{r warning=FALSE}
DimPlot(ret.combined, reduction = "umap", group.by = "orig.ident")
DimPlot(ret.combined, reduction = "umap", label = TRUE)
```
```{r, fig.height = 4, fig.width = 3}
DimPlot(ret.combined, reduction = "umap", split.by = "orig.ident", ncol = 1)
```

# Identify Clusters with Canonical Markers
```{r}
DefaultAssay(ret.combined) <- "RNA"

features <- tolower(c("Pde6a","Gnat2","Nefl","Camk2b","Thy1","Gad1","Slc6a9",
                      "Pcsk6","Trpm1","Sept4","Glul","Arr3","C1qa","Tm4sf1", "Mgp"))

FeaturePlot(object = ret.combined, 
            features = features, 
            pt.size = 0.1,
            cols = c("lightgrey", "#F26969"),
            min.cutoff = "q9",
            combine = TRUE) & NoLegend() & NoAxes()
```

* Rod : pde6a
* AC (amacrine cell) : gad1, slc6a9
* MG (Müller glia) : glul
* BC (bipolar cell) : Trpm, camk2b
* CC (cone cell) : gnat2, arr3
* RGC (retinal ganglial cell) : nefl, thy1
* VC (vascular cell) : mgp, tm4sf1
* M (microglia) : c1qa
* HC (horizontal cell) : sept4

Markers were determined from [this](https://www.nature.com/articles/s41467-019-12780-8) paper and other sources.
```{r}
ret.combined <- RenameIdents(ret.combined, `0` = "MG", `1` = "Rod", `2` = "RGC", 
    `3` = "RGC", `4` = "BC", `5` = "CC", `6` = "BC", `7` = "AC", `8` = "BC", `9` = "RGC", 
    `10` = "RGC", `11`= "HC", `12` = "MG", `13` = "VC", `14` = "RGC", `15` = "RGC", `16` = "M", `17` = "RGC")

DimPlot(ret.combined, label = TRUE)
```


# Find Differentially Expressed Genes
```{r}
cells.types <- c("Rod", "BC", "MG", "RGC", "CC", "AC", "VC", "HC", "M")
theme_set(theme_cowplot())

cell_type_avg <- function(seurat.combined, ident) {
  cells.x <- subset(seurat.combined, idents = ident)
  Idents(cells.x) <- "orig.ident"
  cells.x.avg <- log1p(AverageExpression(cells.x, verbose = FALSE)$RNA)
  cells.x.avg$gene <- rownames(cells.x.avg)
  return(cells.x.avg)
}

cells.plot <- as.list(cells.types)
cells.plot <- lapply(cells.plot, FUN = function(x) {
  cells.x.avg <- cell_type_avg(ret.combined, ident = x)
  x <- ggplot(cells.x.avg, aes(human_ret, mouse_ret)) + geom_point(size = 0.1) + ggtitle(x)
  return(x)
})

# For individual plots
# for (p in cells.plot) {
#   print(p)
# }

# For grid plot
cowplot::plot_grid(plotlist = cells.plot, ncol = 3)
```
```{r}
ret.combined$celltype.organism <- paste(Idents(ret.combined), ret.combined$orig.ident, sep = "_")
ret.combined$celltype <- Idents(ret.combined)
Idents(ret.combined) <- "celltype.organism"
```
```{r}
cells.diffgenes <- as.list(cells.types)
cells.diffgenes <- lapply(cells.diffgenes, FUN = function(x) {
  lab_human <- sprintf("%s_human_ret", x)
  lab_mouse <- sprintf("%s_mouse_ret", x)
  return(FindMarkers(ret.combined, ident.1 = lab_human, ident.2 = lab_mouse))
})


for(i in seq_along(cells.diffgenes)) {
	x <- cells.diffgenes[[i]]
	x <- cbind(x, logp = -log(x$p_val), types = cells.types[[i]], genes = rownames(x))
	x <- x[!grepl("mt-", x$genes),] # remove mitochondrial genes
	cells.diffgenes[[i]] <- x
	rm(x)
}
```
Tables with the most differentially expressed genes in each cell subtype:
```{r}
for(i in seq_along(cells.diffgenes)) {
  print(knitr::kable(head(cells.diffgenes[[i]]),caption=cells.types[[i]]))
}
```
Save as csv files
```{r}
for(i in seq_along(cells.diffgenes)) {
  write.csv(cells.diffgenes[[i]], sprintf("results/%d_%s.csv", i, cells.types[[i]]))
}
```

```{r warning=FALSE}
genes_to_plot <- 3
for (i in seq_along(cells.types)) {
  print(FeaturePlot(object = ret.combined, 
              features = rownames(cells.diffgenes[[i]])[1:genes_to_plot], 
              split.by = "orig.ident", 
              max.cutoff = 3, 
              cols = c("grey", "red"),
              pt.size = 0.07,
              combine = TRUE,
              label.size = 0.5
              ) + plot_annotation(title = cells.types[[i]]) & NoLegend() & NoAxes()
        )
}
```

Check cell proportion for each species:
```{r}
knitr::kable(prop.table(x = table(Idents(ret.combined), ret.combined@meta.data$orig.ident), margin = 2))
```

# Gene Enrichment Analysis
```{r}
library(ggplot2)
library(ggrepel)
library(scales)
library(data.table)
cells.diffgenes.combined <- rbindlist(cells.diffgenes)

# Preprocessing
# for(i in seq_along(cells.diffgenes.combined)) {
# 	if (cells.diffgenes.combined[i]$logp > 750) {
# 		cells.diffgenes.combined[i]$logp <- 749
# 	}
# 	if (cells.diffgenes.combined[i]$avg_logFC > 3) {
# 		cells.diffgenes.combined[i]$avg_logFC <- 2.99
# 	}
# }
# 
# cells.diffgenes.combined$logp <- gsub("Inf", 749, cells.diffgenes.combined$logp)

ggplot(data=cells.diffgenes.combined, 
		   aes(x=avg_logFC,y=logp, colour=types, label = genes)) + 
	geom_point(size=0.2) + 
	theme_bw() + 
	theme(panel.background = element_rect(fill = NA), 
		  axis.ticks.x = element_blank(),  
		  axis.text.y = element_text(size = 12), 
		  panel.grid.major = element_blank(), 
		  panel.grid.minor = element_blank()) + 
	labs(x = "log2(Fold changes)\n(3K/WT)", y ="-log10(p value)") +
	scale_x_continuous(limits=c(-3, 3)) +
	scale_y_continuous(limits=c(1, 750)) +
	geom_hline(yintercept= 1, colour="grey", linetype="dashed", size=0.7 ) +
	geom_vline(xintercept= 0 , colour="grey",   size=0.7)

```

```{r, fig.height = 4, fig.width = 6, dpi = 400, warning=FALSE}
plot_enrichment <- function(type = "Rod", other = "") {
	file_path <- sprintf("enrich_data/%s_%s.txt", type, other)
	x <- read.table(file_path, header=T, sep="\t", skip = 11)
	colnames(x) <- gsub("upload_1..fold.Enrichment.", "Fold_Enrichment", colnames(x))
	colnames(x) <- gsub("upload_1..FDR.", "FDR", colnames(x))
	colnames(x) <- gsub("GO.biological.process.complete", "GO", colnames(x))
	colnames(x) <- gsub("Homo.sapiens...REFLIST..20851.", "Count", colnames(x))
	x$GO<- factor(x$GO, levels = x$GO[order(x$Fold_Enrichment, decreasing =F)])
	x <- x[order(x$FDR),]
	x <- x[1:10,]
	g<- ggplot(data=x, aes(x=Fold_Enrichment, y=GO, colour=FDR)) + 
		geom_point(aes(size=Count)) + 
		theme_bw() +
		theme(panel.background = element_rect(fill = NA) , 
			  axis.ticks.x=element_blank(), 
			  axis.text.y = element_text(size = 12) , 
			  panel.grid.major = element_blank(), 
			  panel.grid.minor = element_blank()) + 
		labs(x = " ", y =" ") +
		scale_colour_gradient(low = "red", high = "blue") +
		ggtitle(type)
	return(g)
}

for (type in cells.types) {
	print(plot_enrichment(type = type, other= "top200"))
	rm(type)
}
```

